### setwd("/Users/seantrott/Dropbox/UCSD/Research/Ambiguity/SSD/spanish_norms/src/analysis/")
### Read in all data
# Human relatedness judgments for the critical SAW-C sentence pairs.
df_final <- read_csv("../../data/processed/human/sawc_relatedness_full_critical_data.csv")
# Total number of observations (participant x item trials).
nrow(df_final)
## [1] 10639
# Number of distinct participants contributing judgments.
length(unique(df_final$Participant))
## [1] 131
# Proportion of each response value within each Same_sense condition,
# i.e. P(Response | Condition).
df_ratio <- df_final %>%
  # Per-condition trial counts (equivalent to group_by + mutate(n()) + ungroup).
  add_count(Same_sense, name = "count_condition") %>%
  group_by(Same_sense, Response, count_condition) %>%
  summarise(count_response = n()) %>%
  mutate(prop_response = count_response / count_condition)

# Normalized response distribution, faceted by condition.
df_ratio %>%
  ggplot(aes(x = Response, y = prop_response)) +
  geom_col(alpha = .6) +
  theme_minimal() +
  labs(x = "Relatedness",
       y = "P(Response | Condition)") +
  facet_wrap(~Same_sense) +
  theme(text = element_text(size = 15),
        legend.position = "none")

# Raw response counts, faceted by condition.
df_final %>%
  ggplot(aes(x = Response)) +
  geom_bar(alpha = .6) +
  theme_minimal() +
  labs(x = "Relatedness",
       y = "Count") +
  facet_wrap(~Same_sense) +
  theme(text = element_text(size = 15),
        legend.position = "none")
# Mixed-effects model of relatedness: fixed effect of Same_sense, with
# by-participant and by-list random slopes for Same_sense and random
# intercepts for Word. Fit with ML (REML = FALSE) so the nested models
# can be compared with a likelihood-ratio test.
mod_full <- lmer(Response ~ Same_sense +
                   (1 + Same_sense | Participant) +
                   (1 + Same_sense | List) + (1 | Word),
                 data = df_final,
                 REML = FALSE)
# Reduced model: identical random-effects structure, but the fixed effect
# of Same_sense is removed (intercept-only fixed effects).
mod_reduced <- lmer(Response ~
                      (1 + Same_sense | Participant) +
                      (1 + Same_sense | List) + (1 | Word),
                    data = df_final,
                    REML = FALSE)
summary(mod_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: Response ~ Same_sense + (1 + Same_sense | Participant) + (1 +
## Same_sense | List) + (1 | Word)
## Data: df_final
##
## AIC BIC logLik deviance df.resid
## 34198.6 34271.3 -17089.3 34178.6 10629
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8340 -0.6554 -0.1521 0.6411 3.2534
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## Participant (Intercept) 0.13240 0.3639
## Same_senseSame Sense 0.13065 0.3614 -0.59
## Word (Intercept) 0.32042 0.5661
## List (Intercept) 0.02791 0.1671
## Same_senseSame Sense 0.07990 0.2827 -0.55
## Residual 1.35993 1.1662
## Number of obs: 10639, groups: Participant, 131; Word, 102; List, 10
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 2.11454 0.08478 29.45271 24.94 < 2e-16 ***
## Same_senseSame Sense 2.24722 0.09838 9.67540 22.84 9.65e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Sm_snsSmSns -0.406
# Likelihood-ratio test for the fixed effect of Same_sense.
anova(mod_full, mod_reduced)
## Data: df_final
## Models:
## mod_reduced: Response ~ (1 + Same_sense | Participant) + (1 + Same_sense | List) + (1 | Word)
## mod_full: Response ~ Same_sense + (1 + Same_sense | Participant) + (1 + Same_sense | List) + (1 | Word)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## mod_reduced 9 34236 34302 -17109 34218
## mod_full 10 34199 34271 -17089 34179 39.589 1 3.135e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### BETO distances
# Layer-wise cosine distances between contextualized embeddings of the
# target word in each sentence pair, from BETO (bert-base-spanish-wwm-cased).
df_beto_distances <- read_csv("../../data/processed/models/sawc-distances_model-bert-base-spanish-wwm-cased.csv")
nrow(df_beto_distances)
## [1] 10556
### basic items to get Tag
df_sawc_items <- read_csv("../../data/raw/items/sawc_sentence_pairs.csv")
### merge together
df_merged <- left_join(df_beto_distances, df_sawc_items)
nrow(df_merged)
## [1] 10556
# Mean human relatedness per item (List x Word x Tag).
df_list_mean <- df_final %>%
  group_by(List, Word, Tag) %>%
  summarise(mean_relatedness = mean(Response),
            count = n(),
            .groups = "drop")
nrow(df_list_mean)
## [1] 812
# Attach item-level human means to the model distances.
df_merged_beto <- inner_join(df_merged, df_list_mean)
nrow(df_merged_beto)
## [1] 10556
# Correlate item-level human relatedness with BETO distance, separately
# for each layer of the model.
df_by_layer <- df_merged_beto %>%
  group_by(Layer) %>%
  summarise(r = cor(mean_relatedness, Distance, method = "pearson"),
            r2 = r^2,
            rho = cor(mean_relatedness, Distance, method = "spearman"),
            count = n())
summary(df_by_layer$rho)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.58780 -0.57770 -0.56818 -0.48504 -0.47098 0.02152
# Pearson's r as a function of layer.
df_by_layer %>%
  ggplot(aes(x = Layer, y = r)) +
  geom_line(size = 2, color = "cornflower blue") +
  theme_minimal() +
  scale_x_continuous(breaks = c(0:13)) +
  labs(x = "Layer (BETO)",
       y = "Pearson's r") +
  theme(text = element_text(size = 15),
        legend.position = "none")

# Spearman's rho as a function of layer.
df_by_layer %>%
  ggplot(aes(x = Layer, y = rho)) +
  geom_line(size = 2, color = "cornflower blue") +
  theme_minimal() +
  scale_x_continuous(breaks = c(0:13)) +
  labs(x = "Layer (BETO)",
       y = "Spearman's rho") +
  theme(text = element_text(size = 15),
        legend.position = "none")

# R2 as a function of layer.
df_by_layer %>%
  ggplot(aes(x = Layer, y = r2)) +
  geom_line(size = 2, color = "cornflower blue") +
  theme_minimal() +
  scale_x_continuous(breaks = c(0:13)) +
  labs(x = "Layer (BETO)",
       y = "R2") +
  theme(text = element_text(size = 15),
        legend.position = "none")
# Strongest (most negative) Spearman correlation across layers.
min(df_by_layer$rho)
## [1] -0.5878024
# "Expected layer": center of mass of the layer-to-layer change in R2 --
# a summary of where gains in fit are concentrated across depth.
df_by_layer <- df_by_layer %>%
  mutate(r2_delta = c(NA, diff(r2)),      # first layer has no predecessor
         weighted_layer = Layer * r2_delta)
expected_layer <- sum(df_by_layer$weighted_layer, na.rm = TRUE) /
  sum(df_by_layer$r2_delta, na.rm = TRUE)
expected_layer
## [1] 2.454195
# R2 by layer, annotated with the expected layer (dashed) and the
# best-performing layer, 7 (dotted).
df_by_layer %>%
  ggplot(aes(x = Layer, y = r2)) +
  geom_line(size = 2,
            color = "cornflower blue",
            alpha = .8) +
  theme_minimal() +
  geom_vline(xintercept = expected_layer, size = 1.5,
             linetype = "dashed", alpha = .7) +
  geom_vline(xintercept = 7, size = 1.5,
             linetype = "dotted", alpha = .7) +
  scale_x_continuous(breaks = c(0:13)) +
  labs(x = "Layer (BETO)",
       y = "R2") +
  theme(text = element_text(size = 15),
        legend.position = "none")
### Now, we select the best-performing layer from BETO.
# Keep only the best-performing BETO layer (layer 7).
# NOTE: renamed from df_beto_l5 -- the filter keeps Layer == 7, so the old
# "l5" suffix was misleading.
df_beto_l7 <- df_merged %>%
  filter(Layer == 7) %>%
  # Drop the model-side flag; df_final already carries Same_sense.
  select(-Same_sense)
nrow(df_beto_l7)
## [1] 812
# Attach the layer-7 distances to the trial-level human data.
df_experimental_with_beto <- df_final %>%
  left_join(df_beto_l7)
nrow(df_experimental_with_beto)
## [1] 10639
# Full model: both the Same_sense manipulation and BETO Distance as fixed
# effects, with by-participant random slopes for both.
mod_full <- lmer(Response ~ Same_sense + Distance +
                   (1 + Same_sense + Distance | Participant) +
                   (1 | List) + (1 | Word),
                 data = df_experimental_with_beto,
                 REML = FALSE)
# Drops the fixed effect of Same_sense (random effects unchanged).
mod_reduced <- lmer(Response ~ Distance +
                      (1 + Same_sense + Distance | Participant) +
                      (1 | List) + (1 | Word),
                    data = df_experimental_with_beto,
                    REML = FALSE)
# Drops the fixed effect of Distance (random effects unchanged).
mod_just_same <- lmer(Response ~ Same_sense +
                        (1 + Same_sense + Distance | Participant) +
                        (1 | List) + (1 | Word),
                      data = df_experimental_with_beto,
                      REML = FALSE)
summary(mod_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: Response ~ Same_sense + Distance + (1 + Same_sense + Distance |
## Participant) + (1 | List) + (1 | Word)
## Data: df_experimental_with_beto
##
## AIC BIC logLik deviance df.resid
## 33795.9 33883.2 -16886.0 33771.9 10627
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.0633 -0.6341 -0.1164 0.6223 3.5636
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## Participant (Intercept) 0.23178 0.4814
## Same_senseSame Sense 0.19301 0.4393 -0.51
## Distance 0.87600 0.9359 -0.69 -0.15
## Word (Intercept) 0.28323 0.5322
## List (Intercept) 0.01825 0.1351
## Residual 1.30359 1.1417
## Number of obs: 10639, groups: Participant, 131; Word, 102; List, 10
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 2.91931 0.09022 74.77499 32.36 <2e-16 ***
## Same_senseSame Sense 1.87369 0.04923 144.30838 38.06 <2e-16 ***
## Distance -4.26971 0.22414 306.55782 -19.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Sm_sSS
## Sm_snsSmSns -0.389
## Distance -0.523 0.300
# Does Same_sense explain variance beyond Distance?
anova(mod_full, mod_reduced)
## Data: df_experimental_with_beto
## Models:
## mod_reduced: Response ~ Distance + (1 + Same_sense + Distance | Participant) + (1 | List) + (1 | Word)
## mod_full: Response ~ Same_sense + Distance + (1 + Same_sense + Distance | Participant) + (1 | List) + (1 | Word)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## mod_reduced 11 34125 34205 -17052 34103
## mod_full 12 33796 33883 -16886 33772 331.55 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Does Distance explain variance beyond Same_sense?
anova(mod_full, mod_just_same)
## Data: df_experimental_with_beto
## Models:
## mod_just_same: Response ~ Same_sense + (1 + Same_sense + Distance | Participant) + (1 | List) + (1 | Word)
## mod_full: Response ~ Same_sense + Distance + (1 + Same_sense + Distance | Participant) + (1 | List) + (1 | Word)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## mod_just_same 11 34002 34082 -16990 33980
## mod_full 12 33796 33883 -16886 33772 208.27 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### Visualize
# Bin BETO distances into 20 quantile bins, then plot mean human
# relatedness (+/- 1 SE ribbon) per bin, by Same_sense condition.
df_experimental_with_beto %>%
  mutate(distance_binned = ntile(Distance, 20)) %>%
  group_by(Same_sense, distance_binned) %>%
  summarize(mean_relatedness = mean(Response),
            sd_relatedness = sd(Response),
            count = n(),
            se_relatedness = sd_relatedness / sqrt(count)) %>%
  ggplot(aes(x = distance_binned,
             y = mean_relatedness,
             color = Same_sense,
             fill = Same_sense)) +
  geom_line(size = 1.5) +
  geom_ribbon(aes(ymin = mean_relatedness - se_relatedness,
                  ymax = mean_relatedness + se_relatedness),
              alpha = 0.8,
              color = NA) +
  labs(x = "BETO Cosine Distance (Binned)",
       y = "Relatedness",
       color = "Same Sense",
       fill = "Same Sense") +
  theme_minimal() +
  scale_fill_manual(values = my_colors) +
  scale_color_manual(values = my_colors) +
  theme(text = element_text(size = 15),
        legend.position = "bottom")
### First, get group means
# Item-level means of human relatedness plus mean BETO (layer-7) distance.
df_list_mean <- df_experimental_with_beto %>%
  group_by(List, Word, Tag) %>%
  summarise(mean_relatedness = mean(Response),
            count = n(),
            distance = mean(Distance),
            .groups = "drop")
nrow(df_list_mean)
## [1] 812
### Get BETO cor
# Absolute Spearman correlation between item means and BETO distance.
BETO_COR <- abs(cor(df_list_mean$mean_relatedness,
                    df_list_mean$distance, method = "spearman"))
### Now, iterate through ppts
# One row per participant: Spearman correlation between their own
# responses and the item-level means.
# NOTE(review): the item means include each participant's own ratings, so
# despite the later "leave-one-out" label this is not strictly
# leave-one-out -- confirm intended.
# Refactored from rbind() inside a for loop (quadratic copying) to a
# single lapply + do.call(rbind, ...); output is identical.
ppts <- unique(df_final$Participant)
df_r <- do.call(rbind, lapply(ppts, function(ppt) {
  # Subset df_final for the current participant
  individual_data <- df_final %>%
    filter(Participant == ppt) %>%
    select(List, Word, Tag, Response)
  # Merge individual data with the item means
  merged_data <- inner_join(individual_data, df_list_mean,
                            by = c("List", "Word", "Tag"))
  test <- cor.test(merged_data$Response,
                   merged_data$mean_relatedness,
                   method = "spearman")
  df_test <- broom::tidy(test)
  df_test$ppt <- ppt
  df_test$List <- unique(individual_data$List)
  df_test
}))
### Visualization
# Distribution of per-participant correlations; the BETO correlation is
# marked with a dashed vertical line.
df_r %>%
  ggplot(aes(x = estimate)) +
  geom_histogram(alpha = .6) +
  geom_vline(xintercept = BETO_COR, size = 1.5,
             linetype = "dashed", alpha = .7) +
  scale_x_continuous(limits = c(0, 1)) +
  theme_minimal() +
  labs(x = "Leave-one-out Correlation") +
  theme(text = element_text(size = 15),
        legend.position = "none")
summary(df_r$estimate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3946 0.7480 0.7962 0.7725 0.8308 0.8834
### What proportion of inter-annotator agreement scores are larger?
prop_larger <- df_r %>%
  mutate(larger = estimate >= BETO_COR) %>%
  summarise(mean(larger))
prop_larger
## # A tibble: 1 × 1
## `mean(larger)`
## <dbl>
## 1 0.947
### The actual distance computation is implemented in Python; here we only visualize the results.
# Mean and standard error of BETO distance per layer and condition.
df_summary <- df_beto_distances %>%
  group_by(Layer, Same_sense) %>%
  summarise(mean_Distance = mean(Distance),
            sd_Distance = sd(Distance),
            count = n(),
            se_Distance = sd_Distance / sqrt(count),
            .groups = "drop")
# Mean distance by layer, with +/- 2 SE ribbons, colored by condition.
df_summary %>%
  ggplot(aes(x = Layer,
             y = mean_Distance,
             color = Same_sense,
             fill = Same_sense)) +
  geom_line(size = 1.5) +
  geom_ribbon(aes(ymin = mean_Distance - 2 * se_Distance,
                  ymax = mean_Distance + 2 * se_Distance),
              alpha = 0.8,
              color = NA) +
  labs(x = "Layer (BETO)",
       y = "Mean Distance",
       color = "Same Sense",
       fill = "Same Sense") +
  scale_x_continuous(breaks = c(0:13)) +
  theme_minimal() +
  scale_fill_manual(values = my_colors) +
  scale_color_manual(values = my_colors) +
  theme(text = element_text(size = 15),
        legend.position = "bottom")
# Item-level summary statistics of relatedness (one row per sentence pair).
df_item_means <- df_final %>%
  group_by(List, Word, Same_sense, Sentence_1, Sentence_2,
           Sense_id_s1, Sense_id_s2, Gender_s1, Gender_s2) %>%
  summarise(mean_relatedness = mean(Response),
            sd_relatedness = sd(Response),
            median_relatedness = median(Response),
            count = n())

# Histogram of item means, faceted by condition.
df_item_means %>%
  ggplot(aes(x = mean_relatedness)) +
  geom_histogram(alpha = .6, bins = 10) +
  theme_minimal() +
  labs(x = "Mean Relatedness",
       y = "Count") +
  facet_wrap(~Same_sense) +
  theme(text = element_text(size = 15),
        legend.position = "none")

# Ridge/density view of the same distributions.
# NOTE: ..density.. is the legacy spelling of after_stat(density);
# kept for compatibility with the installed ggplot2 version.
df_item_means %>%
  ggplot(aes(x = mean_relatedness,
             y = Same_sense,
             fill = Same_sense)) +
  geom_density_ridges2(aes(height = ..density..),
                       color = gray(0.25),
                       alpha = .7,
                       scale = .85,
                       size = 0,
                       stat = "density") +
  labs(x = "Mean Relatedness",
       y = "",
       fill = "") +
  theme_minimal() +
  scale_fill_manual(values = my_colors) +
  theme(text = element_text(size = 15),
        legend.position = "none")
### Load cosine-distance files for every model, tagging each with its
### display name and whether it is monolingual or multilingual.
# Refactored from twelve copy-pasted read_csv()/mutate() stanzas into one
# helper; every original variable name and the final row order of df_all
# are preserved.
load_model_distances <- function(slug, model_name, multilingual) {
  read_csv(paste0("../../data/processed/models/sawc-distances_model-",
                  slug, ".csv")) %>%
    mutate(Model = model_name,
           Multilingual = multilingual)
}
df_beto <- load_model_distances("bert-base-spanish-wwm-cased", "BETO-cased", "Monolingual")
df_xlm <- load_model_distances("xlm-roberta-base", "XLM-RoBERTa", "Multilingual")
df_mb <- load_model_distances("bert-base-multilingual-cased", "Multilingual BERT", "Multilingual")
df_db <- load_model_distances("distilbert-base-spanish-uncased", "DistilBETO", "Monolingual")
df_ab <- load_model_distances("albert-base-spanish", "ALBERT-base", "Monolingual")
df_at <- load_model_distances("albert-tiny-spanish", "ALBERT-tiny", "Monolingual")
df_axl <- load_model_distances("albert-xlarge-spanish", "ALBERT-xlarge", "Monolingual")
df_al <- load_model_distances("albert-large-spanish", "ALBERT-large", "Monolingual")
df_axxl <- load_model_distances("albert-xxlarge-spanish", "ALBERT-xxlarge", "Monolingual")
df_rb <- load_model_distances("roberta-base-bne", "RoBERTa-base", "Monolingual")
df_rl <- load_model_distances("roberta-large-bne", "RoBERTa-large", "Monolingual")
df_beto_uncased <- load_model_distances("bert-base-spanish-wwm-uncased", "BETO-uncased", "Monolingual")
# Stack all models; argument order matches the original bind order.
df_all <- bind_rows(df_beto, df_xlm, df_db, df_ab, df_at, df_mb,
                    df_axl, df_al, df_axxl, df_rb, df_rl, df_beto_uncased)
nrow(df_all)
## [1] 144536
### basic items to get Tag
df_sawc_items <- read_csv("../../data/raw/items/sawc_sentence_pairs.csv")
### merge together
df_merged <- left_join(df_all, df_sawc_items)
# Item-level human means (List x Word x Tag).
df_list_mean <- df_final %>%
  group_by(List, Word, Tag) %>%
  summarise(mean_relatedness = mean(Response),
            count = n(),
            .groups = "drop")
nrow(df_list_mean)
## [1] 812
df_merged <- inner_join(df_merged, df_list_mean)
nrow(df_merged)
## [1] 144536
# Layer-wise correlations between human means and model distances,
# separately for each model.
df_by_layer <- df_merged %>%
  group_by(Model, Multilingual, Layer, n_params) %>%
  summarise(r = cor(mean_relatedness, Distance, method = "pearson"),
            r2 = r^2,
            rho = cor(mean_relatedness, Distance, method = "spearman"),
            count = n())
max(df_by_layer$r2)
## [1] 0.3321961
# Which model/layer achieves the best fit? (Exact float equality is safe
# here: both sides come from the same computed column.)
df_by_layer %>%
  filter(r2 == max(df_by_layer$r2))
## # A tibble: 1 × 8
## # Groups: Model, Multilingual, Layer [1]
## Model Multilingual Layer n_params r r2 rho count
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 BETO-cased Monolingual 7 109850880 -0.576 0.332 -0.587 812
# Models excluded from the main layer-wise figures (the larger models are
# plotted separately below).
excluded_models <- c("ALBERT-tiny", "ALBERT-xlarge", "DistilBETO",
                     "ALBERT-large", "RoBERTa-large")

# Layer-wise R2 for the main set of models (color + linetype per model).
df_by_layer %>%
  filter(!Model %in% excluded_models) %>%
  ggplot(aes(x = Layer,
             y = r2,
             color = Model,
             linetype = Model)) +
  geom_line(size = 2) +
  theme_minimal() +
  labs(x = "Layer",
       y = "R2",
       color = "",
       linetype = "") +
  scale_x_continuous(breaks = c(0:13)) +
  scale_color_viridis(option = "mako", discrete = TRUE) +
  theme(text = element_text(size = 12),
        legend.position = "bottom")

# Same figure, distinguishing models by color only.
df_by_layer %>%
  filter(!Model %in% excluded_models) %>%
  ggplot(aes(x = Layer,
             y = r2,
             color = Model)) +
  geom_line(size = 2) +
  theme_minimal() +
  labs(x = "Layer",
       y = "R2",
       color = "") +
  scale_x_continuous(breaks = c(0:13)) +
  scale_color_viridis(option = "mako", discrete = TRUE) +
  theme(text = element_text(size = 12),
        legend.position = "bottom")

# The larger models, plotted on their own (deeper layer range).
df_by_layer %>%
  filter(Model %in% c("ALBERT-xlarge",
                      "ALBERT-large",
                      "RoBERTa-large")) %>%
  ggplot(aes(x = Layer,
             y = r2,
             color = Model)) +
  geom_line(size = 2) +
  theme_minimal() +
  labs(x = "Layer",
       y = "R2",
       color = "") +
  scale_color_viridis(option = "mako", discrete = TRUE) +
  theme(text = element_text(size = 15),
        legend.position = "bottom")
### The BETO/ALBERT/DistilBETO family of models provides a nice within-family comparison of the effect of the number of parameters, since these models are trained on the same datasets.
# Best (maximum over layers) R2 for each model.
df_max_r2 <- df_by_layer %>%
  group_by(Model, n_params, Multilingual) %>%
  summarise(max_r2 = max(r2))
# Max R2 vs. parameter count (log scale); the dotted red line marks the
# squared mean per-participant correlation as a human reference point.
df_max_r2 %>%
  ggplot(aes(x = n_params,
             y = max_r2,
             color = Multilingual,
             shape = Multilingual)) +
  geom_point(size = 6,
             alpha = .7) +
  geom_hline(yintercept = mean(df_r$estimate)^2, ### Human accuracy
             linetype = "dotted", color = "red",
             size = 1.2, alpha = .5) +
  geom_text_repel(aes(label = Model), size = 3) +
  scale_x_log10() +
  scale_y_continuous(limits = c(0, 1)) +
  labs(x = "Parameters",
       y = "Maximum R2",
       color = "",
       shape = "") +
  theme_minimal() +
  scale_color_manual(values = my_colors) +
  theme(text = element_text(size = 15),
        legend.position = "bottom")
### Same vs. Different Sense residuals
# Fit models and calculate residuals for each LLM
# For each model, regress item-level relatedness on Distance * Layer and
# attach the residuals to that model's rows.
models <- by(df_merged, df_merged$Model, function(model_df) {
  fit <- lm(mean_relatedness ~ Distance * Layer, data = model_df)
  model_df$residuals <- residuals(fit)
  model_df
})
# Stack the per-model results back into a single data frame.
results <- do.call(rbind, models)
# Recode the Same_sense flag into human-readable labels.
results <- results %>%
  mutate(Same_sense = case_when(
    Same_sense == TRUE ~ "Same Sense",
    Same_sense == FALSE ~ "Different Sense"
  ))
# Residual densities by condition, faceted by model.
results %>%
  ggplot(aes(x = residuals,
             fill = Same_sense)) +
  geom_density(alpha = .7, size = 0) +
  geom_vline(xintercept = 0, size = .6, linetype = "dashed") +
  theme_minimal() +
  theme(text = element_text(size = 15),
        legend.position = "bottom") +
  scale_fill_manual(values = my_colors) +
  labs(x = "Residuals (Relatedness ~ Distance * Layer)",
       fill = "") +
  facet_wrap(~Model)